# WORDSCORES

# Install and load packages

# Install only the packages that are not yet available instead of
# re-installing everything on every run of the script. `remotes` is included
# because the two GitHub-hosted quanteda add-ons are installed through it.
cran_packages <- c("quanteda", "fixest", "dplyr", "stringr", "remotes")
cran_missing <- cran_packages[
  !vapply(cran_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(cran_missing) > 0) {
  install.packages(cran_missing)
}

github_packages <- c(
  quanteda.textmodels = "quanteda/quanteda.textmodels",
  quanteda.textplots = "quanteda/quanteda.textplots"
)
for (pkg in names(github_packages)) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    remotes::install_github(github_packages[[pkg]])
  }
}

library(dplyr)
library(quanteda)
library(fixest)
library(quanteda.textplots)
library(quanteda.textmodels)
library(stringr)

# Load data

# Switch to the folder that holds the input data ("FOLDER PATH" is a
# placeholder that must be replaced with a real directory).
setwd("FOLDER PATH")

# load() restores objects under the names they were saved with and does not
# complain if the expected one is missing, so check for the data frame that
# the rest of the script works on.
load("df_ft_all_final.rda")
if (!exists("df_ft_all")) {
  stop(
    "Object 'df_ft_all' is not available after loading 'df_ft_all_final.rda'.",
    call. = FALSE
  )
}

# ALL PERIODS - COMBINED

# Use the edited text as the speech text
df_ft_all$speech <- df_ft_all$speech_edit

# Reference documents: copy every row that carries a reference score and flag
# the copy with duplicate_wordscore = 1
has_reference_score <- !is.na(df_ft_all$reference_wordscore)
df_ft_all_duplicates <- df_ft_all[has_reference_score, ]
df_ft_all_duplicates$duplicate_wordscore <- 1

# Keep rows dated 1910-01-01 through 1919-08-11 in which at least one of the
# four topic columns is positive
df_ft_all_combined_wordscore <- df_ft_all %>%
  filter(
    date >= as.Date("1910-01-01"),
    date <= as.Date("1919-08-11")
  ) %>%
  filter(
    arbejdsløshedsstøtte > 0 | dyrtid > 0 | arbejdsdag > 0 | merindkomstskat > 0
  )

# Append the flagged reference copies below the filtered rows
df_ft_all_combined_wordscore <- bind_rows(
  df_ft_all_combined_wordscore,
  df_ft_all_duplicates
)

# Removing speaker from speech
# Step 1: bring speaker names into a standard format by deleting, one pattern
# after the other, the characters ) ( { [ ] } \ and *
speaker_noise_patterns <- c(
  "\\)", "\\(", "\\{", "\\[", "\\]", "\\}", "\\\\", "\\*"
)
for (noise_pattern in speaker_noise_patterns) {
  df_ft_all_combined_wordscore$speaker <- str_replace_all(
    df_ft_all_combined_wordscore$speaker,
    noise_pattern,
    ""
  )
}

# Remove speaker from speech
# Delete the first occurrence of each row's speaker name from its speech.
# - fixed() makes the name match literally. The clean-up of speaker names
#   earlier in the script strips only some regex metacharacters, so a
#   remaining ".", "?", "+" or "|" would otherwise be read as regex syntax.
# - Rows with a missing or empty speaker are skipped, so their speech is left
#   untouched.
# - The call is vectorised over rows, which also handles a data frame with
#   zero rows (1:nrow(...) would iterate over c(1, 0)).
speaker_known <- !is.na(df_ft_all_combined_wordscore$speaker) &
  nzchar(df_ft_all_combined_wordscore$speaker)
df_ft_all_combined_wordscore$speech[speaker_known] <- str_replace(
  df_ft_all_combined_wordscore$speech[speaker_known],
  fixed(df_ft_all_combined_wordscore$speaker[speaker_known]),
  ""
)
# When a ":" occurs within the first 70 characters of a speech, remove
# everything up to and including that first ":".
# Vectorised so that a missing speech (where `if (NA)` is an error) or a data
# frame with zero rows (where 1:nrow(...) yields c(1, 0)) no longer breaks
# this step; such rows are simply left unchanged.
speech_text <- df_ft_all_combined_wordscore$speech
colon_end <- str_locate(str_sub(speech_text, 1, 70), fixed(":"))[, "end"]
has_early_colon <- !is.na(colon_end)
speech_text[has_early_colon] <- str_sub(
  speech_text[has_early_colon],
  colon_end[has_early_colon] + 1
)
df_ft_all_combined_wordscore$speech <- speech_text

# When ")." occurs within the first 70 characters of a speech, remove
# everything up to and including that ").".
# The cut position comes from the ")." match itself. Previously it was taken
# from the first ":" anywhere in the speech, which removed unrelated text and
# turned the speech into NA whenever it contained no ":" at all.
speech_text <- df_ft_all_combined_wordscore$speech
paren_end <- str_locate(str_sub(speech_text, 1, 70), "\\)\\.")[, "end"]
has_early_paren <- !is.na(paren_end)
speech_text[has_early_paren] <- str_sub(
  speech_text[has_early_paren],
  paren_end[has_early_paren] + 1
)
df_ft_all_combined_wordscore$speech <- speech_text

# Tokenise the cleaned speaker names (used further down to drop these tokens
# from the document-feature matrix)
tokens_speaker <- tokens(df_ft_all_combined_wordscore$speaker)

# Lookup table from original party code to grouped party code, written as
# (original, grouped) pairs
party_pairs <- rbind(
  c(2, 2), c(3, 3), c(4, 4), c(5, 5),
  c(7, 3), c(8, 3),
  c(13, 4), c(14, 4), c(15, 4),
  c(16, 3), c(102, 5)
)
party_mapping <- data.frame(
  original_party = party_pairs[, 1],
  grouped_party = party_pairs[, 2]
)

# Attach the grouped code to every row; party codes missing from the lookup
# get NA
df_ft_all_combined_wordscore <- left_join(
  df_ft_all_combined_wordscore,
  party_mapping,
  by = c("party" = "original_party")
)

# Flag the two later periods and append a period suffix to the group code:
#   post_rev  : 1917-03-08 and later          -> suffix "17"
#   post_1915 : 1915-05-31 through 1917-03-07 -> suffix "15"
# Rows before 1915-05-31 keep the plain group code.
# The lower bound of post_rev is inclusive (>=). With the former strict ">",
# speeches dated exactly 1917-03-08 fell into neither period and were counted
# with the un-suffixed group, although the y17_19 indicator later in the
# script treats 1917-03-08 as part of the 1917-19 period.
df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  mutate(
    post_rev = date >= as.Date("1917-03-08"),
    grouped_party = ifelse(post_rev, paste0(grouped_party, "17"), grouped_party)
  )

df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  mutate(
    post_1915 = date >= as.Date("1915-05-31") & date <= as.Date("1917-03-07"),
    grouped_party = ifelse(post_1915, paste0(grouped_party, "15"), grouped_party)
  )

# Keep only the five columns needed from here on and turn the (possibly
# suffixed) group code into a number
df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  select(
    party, grouped_party, speech,
    reference_wordscore, duplicate_wordscore
  ) %>%
  mutate(grouped_party = as.numeric(grouped_party))

df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  # Keep rows whose group code is one of the plain (2-5) or period-suffixed
  # (..15, ..17) codes; the duplicate flag must be missing or equal to 1
  filter(
    is.na(duplicate_wordscore) | duplicate_wordscore == 1,
    !is.na(grouped_party),
    grouped_party %in% c(2, 3, 4, 5, 217, 317, 417, 517, 215, 315, 415, 515)
  ) %>%
  # Flagged reference copies get their row number as group id, which puts
  # each of them into a group of its own
  mutate(
    grouped_party = if_else(
      duplicate_wordscore %in% 1,
      row_number(),
      grouped_party
    )
  ) %>%
  # Drop unflagged rows that carry a reference score: they are not used as
  # reference texts and would otherwise bias the scoring
  filter(!is.na(duplicate_wordscore) | is.na(reference_wordscore))

# Group by grouped_party
# Unflagged rows keep their group code and are collapsed into one document
# per group; every flagged reference copy becomes a group of its own.
# Reference ids are numbered after the largest group code in use. A bare
# row number could coincide with a group code (e.g. row 215 and group 215),
# which would silently merge a reference text into a party document. With the
# offset the party documents also always sort before the reference documents.
party_code_max <- max(
  c(
    0,
    df_ft_all_combined_wordscore$grouped_party[
      is.na(df_ft_all_combined_wordscore$duplicate_wordscore)
    ]
  ),
  na.rm = TRUE
)

grouped_df_wordscore <- df_ft_all_combined_wordscore %>%
  mutate(
    grouped_party = if_else(
      is.na(duplicate_wordscore),
      grouped_party,
      party_code_max + row_number()
    )
  ) %>%
  filter(!is.na(grouped_party)) %>%
  group_by(grouped_party) %>%
  summarize(
    # All speeches of a group are joined into a single text
    concatenated_speech = paste(speech, collapse = " "),
    reference_wordscore = first(reference_wordscore)
  )

# Reset the row names
rownames(grouped_df_wordscore) <- NULL

# One corpus document per group, named after its group id
corp1 <- corpus(grouped_df_wordscore, text_field = "concatenated_speech")
docnames(corp1) <- grouped_df_wordscore$grouped_party

toks <- tokens(corp1, remove_punct = TRUE)

# Convert the list of speaker tokens into a character vector
tokens_vector <- unlist(tokens_speaker)

# Remove the speaker tokens and the Danish stopwords from the dfm.
# tokens_vector repeats each speaker token once per row it occurs in, so the
# patterns are de-duplicated before matching; the removed features are the
# same.
dfmat_toks <- dfm(toks) %>%
  dfm_remove(pattern = unique(tokens_vector)) %>%
  dfm_remove(pattern = stopwords("da"))

# Trimmed variant: features occurring in more than 5 documents, and only
# documents that still contain tokens.
# NOTE(review): dfmat_toks2 is not referenced anywhere else in the visible
# script (the model below is fitted on dfmat_toks) - confirm whether the
# trimmed matrix was meant to be used.
dfmat_toks2 <- dfm_keep(
  dfmat_toks,
  pattern = featnames(dfmat_toks)[docfreq(dfmat_toks) > 5]
) %>%
  dfm_subset(ntoken(.) > 0)

# Fit the Wordscores model; y holds the reference score of each document (NA
# for documents without one)
tmod_ws <- textmodel_wordscores(
  dfmat_toks,
  y = corp1$reference_wordscore,
  smooth = 1
)
summary(tmod_ws)

# Predicted positions with standard errors for every document
pred_ws <- predict(tmod_ws, se.fit = TRUE, newdata = dfmat_toks)
pred_ws

# Plot all documents
textplot_scale1d(pred_ws)

# Keep only the party-period documents for the second plot and for saving.
# They are picked by document name rather than by position (formerly 1:12),
# so the selection stays correct if a group is missing or the document order
# changes.
party_group_ids <- as.character(
  c(2, 3, 4, 5, 215, 315, 415, 515, 217, 317, 417, 517)
)
is_party_group <- docnames(dfmat_toks) %in% party_group_ids
pred_ws$fit <- pred_ws$fit[is_party_group]
pred_ws$se.fit <- pred_ws$se.fit[is_party_group]

textplot_scale1d(pred_ws)

# Save into the chosen folder. The former file name started with "/", which
# is an absolute path to the file-system root and ignores setwd().
setwd("FOLDER PATH")
save(pred_ws, file = "wordscores_combined.RData")

# WORDSCORES FOR SUBSET OF DATA

# Use the edited text as the speech text
df_ft_all$speech <- df_ft_all$speech_edit

# Make duplicates of reference docs: rows with a reference score are copied
# and the copies flagged with duplicate_wordscore = 1
df_ft_all_duplicates <- df_ft_all[!is.na(df_ft_all$reference_wordscore), ]
df_ft_all_duplicates$duplicate_wordscore <- 1

# Keep rows dated before 1919-08-12 in which at least one of the four topic
# columns is positive. Dates are compared as Date values, as in the
# combined-period section, instead of as formatted strings.
# NOTE(review): unlike the combined-period section there is no lower date
# bound here - confirm that this is intended.
df_ft_all_1019_wordscore <- df_ft_all %>%
  filter(as.Date(date) < as.Date("1919-08-12")) %>%
  filter(
    arbejdsløshedsstøtte > 0 | dyrtid > 0 | arbejdsdag > 0 | merindkomstskat > 0
  )

# Combine original and duplicate data
df_ft_all_1019_wordscore <- bind_rows(
  df_ft_all_1019_wordscore,
  df_ft_all_duplicates
)

# Make speakers into standard format: delete the characters ) ( { } [ ] \ and
# * from the speaker names. "]" is included here as well; it was removed in
# the combined-period section but had been left out of this one.
df_ft_all_1019_wordscore$speaker <- str_replace_all(
  df_ft_all_1019_wordscore$speaker,
  "[\\(\\)\\{\\}\\[\\]\\\\\\*]",
  ""
)

# Delete the first occurrence of each row's speaker name from its speech.
# - fixed() makes the name match literally. The clean-up of speaker names
#   earlier in the script strips only some regex metacharacters, so a
#   remaining ".", "?", "+" or "|" would otherwise be read as regex syntax.
# - Rows with a missing or empty speaker are skipped, so their speech is left
#   untouched.
# - The call is vectorised over rows, which also handles a data frame with
#   zero rows (1:nrow(...) would iterate over c(1, 0)).
speaker_known <- !is.na(df_ft_all_1019_wordscore$speaker) &
  nzchar(df_ft_all_1019_wordscore$speaker)
df_ft_all_1019_wordscore$speech[speaker_known] <- str_replace(
  df_ft_all_1019_wordscore$speech[speaker_known],
  fixed(df_ft_all_1019_wordscore$speaker[speaker_known]),
  ""
)

# When a ":" occurs within the first 70 characters of a speech, remove
# everything up to and including that first ":".
# Vectorised so that a missing speech (where `if (NA)` is an error) or a data
# frame with zero rows (where 1:nrow(...) yields c(1, 0)) no longer breaks
# this step; such rows are simply left unchanged.
speech_text <- df_ft_all_1019_wordscore$speech
colon_end <- str_locate(str_sub(speech_text, 1, 70), fixed(":"))[, "end"]
has_early_colon <- !is.na(colon_end)
speech_text[has_early_colon] <- str_sub(
  speech_text[has_early_colon],
  colon_end[has_early_colon] + 1
)
df_ft_all_1019_wordscore$speech <- speech_text

# When ")." occurs within the first 70 characters of a speech, remove
# everything up to and including that ").".
# The cut position comes from the ")." match itself. Previously it was taken
# from the first ":" anywhere in the speech, which removed unrelated text and
# turned the speech into NA whenever it contained no ":" at all.
speech_text <- df_ft_all_1019_wordscore$speech
paren_end <- str_locate(str_sub(speech_text, 1, 70), "\\)\\.")[, "end"]
has_early_paren <- !is.na(paren_end)
speech_text[has_early_paren] <- str_sub(
  speech_text[has_early_paren],
  paren_end[has_early_paren] + 1
)
df_ft_all_1019_wordscore$speech <- speech_text

# Lookup table from original party code to grouped party code, written as
# (original, grouped) pairs
party_pairs <- rbind(
  c(2, 2), c(3, 3), c(4, 4), c(5, 5),
  c(7, 3), c(8, 3),
  c(13, 4), c(14, 4), c(15, 4),
  c(16, 3), c(102, 5)
)
party_mapping <- data.frame(
  original_party = party_pairs[, 1],
  grouped_party = party_pairs[, 2]
)

# Attach the grouped code to every row; party codes missing from the lookup
# get NA
df_ft_all_1019_wordscore <- left_join(
  df_ft_all_1019_wordscore,
  party_mapping,
  by = c("party" = "original_party")
)


df_ft_all_1019_wordscore <- df_ft_all_1019_wordscore %>%
  # Keep rows that belong to one of the grouped parties 2, 3, 4 or 5; the
  # duplicate flag must be missing or equal to 1
  filter(
    is.na(duplicate_wordscore) | duplicate_wordscore == 1,
    !is.na(grouped_party),
    grouped_party %in% c(2, 3, 4, 5)
  ) %>%
  # Flagged reference copies get their row number as group id
  mutate(
    grouped_party = if_else(
      duplicate_wordscore %in% 1,
      row_number(),
      grouped_party
    )
  ) %>%
  # Drop unflagged rows that carry a reference score
  filter(!is.na(duplicate_wordscore) | is.na(reference_wordscore))

# One corpus document per row, with the speech as text
corp1 <- df_ft_all_1019_wordscore %>%
  corpus(text_field = "speech")

# Tokenise without punctuation, build the dfm and drop Danish stopwords
toks <- corp1 %>%
  tokens(remove_punct = TRUE)

dfmat_toks <- dfm_remove(dfm(toks), pattern = stopwords("da"))

# Fit Wordscores with the reference scores as y and smoothing of 1
tmod_ws <- textmodel_wordscores(
  x = dfmat_toks,
  y = corp1$reference_wordscore,
  smooth = 1
)
summary(tmod_ws)

# Predicted score and standard error for every document
pred_ws <- predict(tmod_ws, newdata = dfmat_toks, se.fit = TRUE)
pred_ws

textplot_scale1d(pred_ws)

# Store each row's predicted score (one prediction per row of the data frame)
df_ft_all_1019_wordscore$wordscores_score <- pred_ws$fit

# Calculate the mean and standard deviation of wordscores_score
mean_score <- mean(df_ft_all_1019_wordscore$wordscores_score, na.rm = TRUE)
sd_score <- sd(df_ft_all_1019_wordscore$wordscores_score, na.rm = TRUE)

# ws_dummy is 1 for scores at least one standard deviation above the mean and
# 0 otherwise (NA where the score itself is missing)
df_ft_all_1019_wordscore$ws_dummy <- ifelse(
  df_ft_all_1019_wordscore$wordscores_score >= mean_score + 1 * sd_score,
  1, 0
)

table(df_ft_all_1019_wordscore$ws_dummy)

# Indicator for rows dated 1917-03-08 through 1919-08-11
df_ft_all_1019_wordscore <- df_ft_all_1019_wordscore %>%
  mutate(
    y17_19 = as.numeric(
      date >= as.Date("1917-03-08") & date <= as.Date("1919-08-11")
    )
  )

# Rows with ws_dummy equal to 1. which() drops rows whose dummy is NA; plain
# logical indexing with `== 1` would return an all-NA row for each of them.
df_ws_dummy_1 <- df_ft_all_1019_wordscore[
  which(df_ft_all_1019_wordscore$ws_dummy == 1),
]
table(df_ws_dummy_1$y17_19)

table(df_ft_all_1019_wordscore$ws_dummy)

# Save into the chosen folder. The former file name started with "/", which
# is an absolute path to the file-system root and ignores setwd().
setwd("FOLDER PATH")
save(df_ws_dummy_1, file = "df_ws_dummy.RData")